## Stephen Moon
## Statistics Senior Thesis

library(dplyr)
library(plyr)


# Every relative file name below (inputs and outputs) is resolved against
# this directory.
setwd("/Users/smoon/Desktop/Thesis/Data/")

# Candidate-level House results and per-congress district demographics.
# as.is = TRUE keeps text columns as character vectors rather than factors.
results <- read.csv(file = "HouseElectionResults.csv", header = TRUE, as.is = TRUE)
demo <- read.csv(file = "CongressDemo.csv", header = TRUE, as.is = TRUE)

# Connecticut 2018 slice of the results; not referenced again in this script.
sampledata <- results[
  which(results$state == "Connecticut" & results$year == 2018), ,
  drop = FALSE
]

# Election years present in the results file, in ascending order.
uniqueResultYears <- sort(unique(results$year))
# The earliest results year is paired with the 95th US Congress
# (NOTE(review): presumably the 1976 election -- confirm against the data).
# Drop all demographic info before the 95th Congress because there are no
# results to pair it with.
demo <- demo[!is.na(demo$congNum) & demo$congNum >= 95, ]
uniqueCongresses <- sort(unique(demo$congNum))

# Add a year variable to the demographic data: the k-th congress (counting
# from the 95th) gets the k-th election year. Both sequences are sorted so
# the pairing does not depend on the row order of either input file.
# Congresses beyond the last available results year end up with NA.
demoYears <- uniqueResultYears[seq_along(uniqueCongresses)]
years <- demoYears[match(demo$congNum, uniqueCongresses)]
demo$year <- years

# Give both datasets the same district coding: state postal code and
# district number joined by a dot.
results$stateDist <- paste(results$state_po, results$district, sep = ".")

# Race key used on both sides of the later merge: year and district code
# joined by an underscore. (demo$stateDist is read as-is from the
# demographics data.)
results$distYear <- paste(results$year, results$stateDist, sep = "_")
demo$distYear <- paste(demo$year, demo$stateDist, sep = "_")

# Set up results in a better format to prepare for the merge: one row per
# distinct (year, district) pair, carrying the same race key.
raceKeys <- results[c("year", "stateDist")]
races <- raceKeys[!duplicated(raceKeys), , drop = FALSE]
races$distYear <- paste(races$year, races$stateDist, sep = "_")

#races[names(results)[!(names(results) %in% names(races))]] <- NA

#FOR NOW I am ignoring runoffs (only 2, and both pre-2006)
#Am only recording D and R vote shares unless there is a substantial reason not to

# Reshape results: one slot per distinct race key.
elections <- unique(races$distYear)
N <- length(elections)

# Per-race output vectors, all starting as NA: nominee names, vote counts
# and race attributes.
dCandidates <- rCandidates <- rep(NA, N)
dVotes <- rVotes <- totalVotes <- rep(NA, N)
special <- state <- rep(NA, N)

# Per-race flags, all starting as FALSE.
twosameparty <- otherMajors <- logical(N)

# Race keys in which one party appears on more than one row.
multiples <- NULL

# Treat counts as numeric (as they should be).
results$candidatevotes <- as.numeric(results$candidatevotes)
results$totalvotes <- as.numeric(results$totalvotes)

# Minimum vote share for a non-D/R candidate to count as significant.
otherThresholdVotes <- 0.05

# For each race, collapse its candidate-level rows into one record:
#   - total votes, state and special-election flag (taken from the first row)
#   - the first listed Democrat and Republican with their vote counts
#   - twosameparty: a party appears on more than one row
#   - otherMajors: some candidate outside the D/R rows reached
#     otherThresholdVotes of the total vote and is not simply the D or R
#     nominee running on an additional party line

# Row positions of every race, computed once instead of rescanning `results`
# on each iteration.
rowsByRace <- split(seq_len(nrow(results)), results$distYear)

for (i in seq_len(N)) {
  election <- elections[i]
  # Pull out the rows for the specific election.
  result <- results[rowsByRace[[election]], ]

  totalVotes[i] <- result$totalvotes[1]
  state[i] <- result$state[1]
  special[i] <- result$special[1]

  # Rows for each major party; everything else (including rows with a
  # missing party) counts as "other".
  dems.idx <- which(result$party == "democrat")
  rep.idx <- which(result$party == "republican")
  # Select by position with setdiff(): negative indexing with an empty index
  # vector would select zero rows instead of all of them.
  other <- result[setdiff(seq_len(nrow(result)), c(dems.idx, rep.idx)), ]

  if (length(dems.idx) > 1) {
    multiples <- c(multiples, result$distYear[1])
    twosameparty[i] <- TRUE
  }

  if (length(rep.idx) > 1) {
    multiples <- c(multiples, result$distYear[1])
    twosameparty[i] <- TRUE
  }

  # Only the first listed candidate of each party is recorded.
  if (length(dems.idx) > 0) {
    dCandidates[i] <- result$candidate[dems.idx[1]]
    dVotes[i] <- result$candidatevotes[dems.idx[1]]
  }

  if (length(rep.idx) > 0) {
    rCandidates[i] <- result$candidate[rep.idx[1]]
    rVotes[i] <- result$candidatevotes[rep.idx[1]]
  }

  if (nrow(other) > 0) {
    # See if any other candidate had at least the threshold share of the
    # vote; rows with a missing share are ignored by which().
    share <- other$candidatevotes / other$totalvotes
    major <- other[which(share >= otherThresholdVotes), ]
    if (nrow(major) > 0) {
      # A row is not a distinct candidate when it is the D or R nominee
      # appearing on another party line. Rows with a missing candidate name
      # are treated as distinct.
      onMajorLine <- !is.na(major$candidate) &
        major$candidate %in% c(dCandidates[i], rCandidates[i])
      otherMajors[i] <- any(!onMajorLine)
    }
  }
}

# Attach the per-race vectors to the race table. The list order fixes the
# column order of the written file.
raceColumns <- list(
  state = state,
  democrat = dCandidates,
  dVotes = dVotes,
  republican = rCandidates,
  rVotes = rVotes,
  totalVotes = totalVotes,
  special = special,
  twosameparty = twosameparty,
  otherMajor = otherMajors
)
for (columnName in names(raceColumns)) {
  races[[columnName]] <- raceColumns[[columnName]]
}

# Persist the race table and the keys of races with repeated parties.
write.csv(x = races, file = "Races.csv")
write.csv(x = multiples, file = "MultipleCandidateRaces.csv")

# Races through 2014: inner join with the demographic data on the race key,
# so only races that have a matching demographic row are kept.
# To get through 2018, add "all.x = TRUE" -- but that was observed to add
# about 200 rows for an unexplained reason, so it is ignored for now.
# NOTE(review): possibly some distYear keys occur more than once in demo --
# confirm before switching.
races14 <- merge(x = races, y = demo, by = "distYear")
write.csv(x = races14, file = "RacesThrough2014.csv")

###########
# Read in DIME data (text columns kept as character).
DIME <- read.csv("dime_cong_elections_current.csv", header = TRUE, as.is = TRUE)

# Convert races14 district codes to DIME codes of the form
# "<year>_<state><two-digit district>".
# The district number is whatever follows the third character of
# stateDist.x (built as "<state_po>.<district>").
distno <- substr(races14$stateDist.x, 4, nchar(races14$stateDist.x))
# District "0" is recoded to "1".
# NOTE(review): presumably at-large seats are numbered 1 in DIME -- confirm.
distno[which(distno == "0")] <- "1"
# Left-pad single-character district numbers with a zero.
shortNo <- which(nchar(distno) < 2)
distno[shortNo] <- paste0("0", distno[shortNo])

races14$DIME <- paste(races14$year.x, paste0(races14$state.y, distno), sep = "_")

# First, keep only House races with primary winners.
house <- DIME[
  which(DIME$pwinner == "W" & DIME$seat == "federal:house"), ,
  drop = FALSE
]

##### REMOVE THIS LATER #######
# Temporary restriction to cycles up to and including 2012.
house <- house[which(house$cycle <= 2012), , drop = FALSE]

# Make a "<cycle>_<district>" code in the same shape as races14$DIME.
house$distYear <- paste(house$cycle, house$district, sep = "_")

# Now, go election cycle by election cycle and attempt to match candidates.
districts <- unique(house$distYear)
N <- nrow(races14)

# Add the empty (all-NA) columns that will receive DIME values.
# Order matters: the Democratic block comes first, DemPresVs sits directly
# after it, and the Republican block follows.
dimeColumnStems <- c(
  "IncumChall", "Gender", "RecipScore", "ContribScore", "DWNom",
  "NumDonors", "TotalReceipts", "CandContrib", "PACContrib", "Unitemized",
  "Disbursements", "ppct", "NumPrimaryOpps"
)
newDimeColumns <- c(
  paste0("d", dimeColumnStems),
  "DemPresVs",
  paste0("r", dimeColumnStems)
)
for (newColumn in newDimeColumns) {
  races14[[newColumn]] <- rep(NA, N)
}

# Column positions of the first and last DIME column of each party block.
startNumD <- which(names(races14) == "dIncumChall")
endNumD <- which(names(races14) == "dNumPrimaryOpps")

startNumR <- which(names(races14) == "rIncumChall")
endNumR <- which(names(races14) == "rNumPrimaryOpps")

############################
#take this out later
problem.districts <- c()

# DIME fields copied for a matched candidate, listed in the same order as
# the destination columns of each party block.
dimeFields <- c(
  "Incum_Chall", "cand_gender", "recipient_cfscore", "contributor_cfscore",
  "dwnom1", "num_distinct_donors", "total_receipts",
  "contribs_from_candidate", "total_pac_contribs", "unitemized",
  "total_disbursements", "ppct", "num_prim_opps"
)

# Which DIME party code feeds which nominee column and destination block.
partyBlocks <- list(
  list(code = "D", nominee = "democrat",
       columns = names(races14)[startNumD:endNumD]),
  list(code = "R", nominee = "republican",
       columns = names(races14)[startNumR:endNumR])
)

# Find the DIME row that belongs to a nominee.
#   candidates: DIME rows of one party in one district-cycle
#   fullName:   nominee name as recorded in races14 (single string)
# A row matches when its surname -- the text before the first comma of
# Name -- occurs inside fullName, ignoring case. The surname is compared as
# literal text, and empty or missing surnames never match.
# Returns the last matching row (one-row data frame), or NULL if none.
findDimeMatch <- function(candidates, fullName) {
  if (nrow(candidates) == 0 || is.na(fullName)) {
    return(NULL)
  }
  surnames <- vapply(
    strsplit(candidates$Name, split = ",", fixed = TRUE),
    function(parts) if (length(parts) > 0) parts[1] else NA_character_,
    character(1)
  )
  usable <- which(!is.na(surnames) & nzchar(surnames))
  lowerName <- tolower(fullName)
  isHit <- vapply(
    surnames[usable],
    function(surname) grepl(tolower(surname), lowerName, fixed = TRUE),
    logical(1),
    USE.NAMES = FALSE
  )
  matched <- usable[isHit]
  if (length(matched) == 0) {
    return(NULL)
  }
  candidates[matched[length(matched)], ]
}

for (i in seq_len(N)) {
  # DIME rows for this race's district-cycle code.
  DIME.dist <- house[which(house$distYear == races14$DIME[i]), ]

  # Taken from the first DIME row of the district; NA when there is none.
  races14$DemPresVs[i] <- DIME.dist$dem_pres_vs[1]

  for (block in partyBlocks) {
    partyRows <- DIME.dist[which(DIME.dist$party == block$code), ]
    match <- findDimeMatch(partyRows, races14[[block$nominee]][i])
    if (is.null(match)) {
      next
    }
    # Copy field by field so each column keeps its own type; combining the
    # values with c() would coerce the numeric fields to character.
    for (k in seq_along(dimeFields)) {
      races14[[block$columns[k]]][i] <- match[[dimeFields[k]]]
    }
  }
}

races14.DIME <- races14
# DIME data has nothing before 1980.
races14.DIME <- subset(races14.DIME, as.numeric(races14.DIME$year.x) >= 1980)

# Number of major parties (0, 1 or 2) with no recorded candidate.
races14.DIME$uncontested <- as.numeric(is.na(races14.DIME$democrat)) +
  as.numeric(is.na(races14.DIME$republican))

# A race with two candidates of the same party is not counted as
# uncontested.
races14.DIME$uncontested <- races14.DIME$uncontested *
  as.numeric(!races14.DIME$twosameparty)

# Drop races with neither a Democrat nor a Republican. A logical mask is
# used because negative indexing with an empty which() result would drop
# every row when no such race exists.
races14.DIME <- races14.DIME[!(races14.DIME$uncontested %in% 2), ]

# A race with a significant other candidate is not counted as uncontested.
races14.DIME$uncontested <- races14.DIME$uncontested *
  as.numeric(!races14.DIME$otherMajor)
races14.DIME$year.x <- as.factor(races14.DIME$year.x)

# Define a variable for the winner: TRUE when no Democratic vote count is
# recorded or the Republican out-polled the Democrat.
# NOTE(review): this is NA when a Democrat ran and no Republican vote count
# is recorded -- confirm whether FALSE is intended for those races.
races14.DIME$Rwinner <- is.na(races14.DIME$dVotes) |
  races14.DIME$rVotes > races14.DIME$dVotes

# Margins are defined only when both a Democratic and a Republican vote
# count are present; all other races keep NA.
#   DemWinMargin: (D votes - R votes) / total votes, signed
#   winMargin:    absolute value of DemWinMargin
bothRan <- !is.na(races14.DIME$dVotes) & !is.na(races14.DIME$rVotes)
demMargin <- rep(NA_real_, nrow(races14.DIME))
demMargin[bothRan] <- (as.numeric(races14.DIME$dVotes[bothRan]) -
  as.numeric(races14.DIME$rVotes[bothRan])) / races14.DIME$totalVotes[bothRan]
races14.DIME$winMargin <- abs(demMargin)
races14.DIME$DemWinMargin <- demMargin

# Recode DemPresVs as its absolute deviation from 0.5.
races14.DIME$PresDiff50 <- abs(races14.DIME$DemPresVs - 0.5)

# Lag variables: within each district code, take the value of each measure
# from the preceding row when rows are ordered by year.x.
races14.DIME <- ddply(
  .data = races14.DIME,
  .variables = "stateDist.x",
  .fun = transform,
  lagUncontested = lag(uncontested, order_by = year.x),
  lagPresDiff50 = lag(PresDiff50, order_by = year.x),
  lagDemPresVs = lag(DemPresVs, order_by = year.x),
  lagWinMargin = lag(winMargin, order_by = year.x),
  lagRWinner = lag(Rwinner, order_by = year.x),
  lagDemWinMargin = lag(DemWinMargin, order_by = year.x)
)

# Squared term of the lagged DemPresVs value.
races14.DIME$LagDemPresVsSqd <- races14.DIME$lagDemPresVs^2

write.csv(x = races14.DIME, file = "RacesWithDIME.csv")

#other <- subset(results, !(results$party == "democrat" | results$party == "republican"))
#other <- subset(other, other$candidatevotes >= 0.05*other$totalvotes)
